import pandas as pd
from geopy.geocoders import Nominatim
import pycountry_convert as pc
import matplotlib.pyplot as plt
import plotly.express as px
import plotly.graph_objects as go
import plotly.subplots as sp
# Load the life-expectancy dataset from the working directory.
# NOTE: several column names used later in this file carry leading/trailing
# spaces (e.g. 'Life expectancy ', ' BMI '), so they must be referenced
# exactly as they appear in the CSV header.
df=pd.read_csv("Life Expectancy Data.csv")
# Notebook display of the loaded frame.
df
| Country | Year | Status | Life expectancy | Adult Mortality | infant deaths | Alcohol | percentage expenditure | Hepatitis B | Measles | ... | Polio | Total expenditure | Diphtheria | HIV/AIDS | GDP | Population | thinness 1-19 years | thinness 5-9 years | Income composition of resources | Schooling | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | Afghanistan | 2015 | Developing | 65.0 | 263.0 | 62 | 0.01 | 71.279624 | 65.0 | 1154 | ... | 6.0 | 8.16 | 65.0 | 0.1 | 584.259210 | 33736494.0 | 17.2 | 17.3 | 0.479 | 10.1 |
| 1 | Afghanistan | 2014 | Developing | 59.9 | 271.0 | 64 | 0.01 | 73.523582 | 62.0 | 492 | ... | 58.0 | 8.18 | 62.0 | 0.1 | 612.696514 | 327582.0 | 17.5 | 17.5 | 0.476 | 10.0 |
| 2 | Afghanistan | 2013 | Developing | 59.9 | 268.0 | 66 | 0.01 | 73.219243 | 64.0 | 430 | ... | 62.0 | 8.13 | 64.0 | 0.1 | 631.744976 | 31731688.0 | 17.7 | 17.7 | 0.470 | 9.9 |
| 3 | Afghanistan | 2012 | Developing | 59.5 | 272.0 | 69 | 0.01 | 78.184215 | 67.0 | 2787 | ... | 67.0 | 8.52 | 67.0 | 0.1 | 669.959000 | 3696958.0 | 17.9 | 18.0 | 0.463 | 9.8 |
| 4 | Afghanistan | 2011 | Developing | 59.2 | 275.0 | 71 | 0.01 | 7.097109 | 68.0 | 3013 | ... | 68.0 | 7.87 | 68.0 | 0.1 | 63.537231 | 2978599.0 | 18.2 | 18.2 | 0.454 | 9.5 |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 2933 | Zimbabwe | 2004 | Developing | 44.3 | 723.0 | 27 | 4.36 | 0.000000 | 68.0 | 31 | ... | 67.0 | 7.13 | 65.0 | 33.6 | 454.366654 | 12777511.0 | 9.4 | 9.4 | 0.407 | 9.2 |
| 2934 | Zimbabwe | 2003 | Developing | 44.5 | 715.0 | 26 | 4.06 | 0.000000 | 7.0 | 998 | ... | 7.0 | 6.52 | 68.0 | 36.7 | 453.351155 | 12633897.0 | 9.8 | 9.9 | 0.418 | 9.5 |
| 2935 | Zimbabwe | 2002 | Developing | 44.8 | 73.0 | 25 | 4.43 | 0.000000 | 73.0 | 304 | ... | 73.0 | 6.53 | 71.0 | 39.8 | 57.348340 | 125525.0 | 1.2 | 1.3 | 0.427 | 10.0 |
| 2936 | Zimbabwe | 2001 | Developing | 45.3 | 686.0 | 25 | 1.72 | 0.000000 | 76.0 | 529 | ... | 76.0 | 6.16 | 75.0 | 42.1 | 548.587312 | 12366165.0 | 1.6 | 1.7 | 0.427 | 9.8 |
| 2937 | Zimbabwe | 2000 | Developing | 46.0 | 665.0 | 24 | 1.68 | 0.000000 | 79.0 | 1483 | ... | 78.0 | 7.10 | 78.0 | 43.5 | 547.358878 | 12222251.0 | 11.0 | 11.2 | 0.434 | 9.8 |
2938 rows × 22 columns
# (rows, columns) of the raw dataset.
df.shape
(2938, 22)
# Count of missing values per column.
df.isna().sum()
Country 0 Year 0 Status 0 Life expectancy 10 Adult Mortality 10 infant deaths 0 Alcohol 194 percentage expenditure 0 Hepatitis B 553 Measles 0 BMI 34 under-five deaths 0 Polio 19 Total expenditure 226 Diphtheria 19 HIV/AIDS 0 GDP 448 Population 652 thinness 1-19 years 34 thinness 5-9 years 34 Income composition of resources 167 Schooling 163 dtype: int64
# Lookup table: two-letter continent code -> display name of the continent.
continent_name = dict(
    AF='Africa',
    AS='Asia',
    EU='Europe',
    NA='North America',
    OC='Oceania',
    SA='South America',
    AN='Antarctica',
)
# Rename a handful of country labels before they are used for the
# continent and coordinate lookups below.
df['Country'] = df['Country'].replace(
    {
        'Bolivia (Plurinational State of)': 'Bolivia',
        'Iran (Islamic Republic of)': 'Iran',
        'Micronesia (Federated States of)': 'Micronesia',
        'Republic of Korea': 'Korea, Republic of',
        'Korea': "Korea (Democratic People's Republic of)",
        'The former Yugoslav republic of Macedonia': 'North Macedonia',
        'Venezuela (Bolivarian Republic of)': 'Venezuela',
    }
)

# One row per distinct country name, so each lookup runs once per country.
unique_countries = pd.DataFrame({'Country': df['Country'].unique()})

# Shared Nominatim client used by get_lat_lon.
geolocator = Nominatim(user_agent="my_geocoder")
def get_lat_lon(country):
    """Geocode a country name and return its coordinates.

    Parameters
    ----------
    country : str
        Country name to look up with the module-level ``geolocator``.

    Returns
    -------
    pandas.Series
        Series with ``latitude`` and ``longitude`` entries. Both are ``None``
        when the geocoder finds no match.
    """
    # Ask explicitly for a *country* first. A free-form search for a bare name
    # can match an unrelated place with the same name: in the recorded run
    # "Albania" came back at roughly (1.23, -75.89), i.e. in South America.
    location = geolocator.geocode({'country': country}, timeout=5)
    if location is None:
        # Not every label is recognised by the structured search; fall back
        # to the original free-form lookup so those still resolve.
        location = geolocator.geocode(country, timeout=5)

    if location is None:
        return pd.Series({'latitude': None, 'longitude': None})
    return pd.Series({'latitude': location.latitude, 'longitude': location.longitude})
def convert_continent(country):
    """Map a country name to the display name of its continent.

    Parameters
    ----------
    country : str
        Country name, in the spelling expected by ``pycountry_convert``.

    Returns
    -------
    str or None
        The continent name from ``continent_name``, or ``None`` when the
        country name or its alpha-2 code cannot be resolved.
    """
    try:
        # Country name -> ISO alpha-2 code -> continent code. Both lookups are
        # inside the ``try`` so that one unrecognised name yields ``None``
        # instead of aborting the whole ``.apply`` over the country list.
        country_code = pc.country_name_to_country_alpha2(country, cn_name_format="default")
        continent_code = pc.country_alpha2_to_continent_code(country_code)
    except KeyError:
        # pycountry_convert signals an unknown name/code with KeyError.
        return None
    return continent_name.get(continent_code, None)
# Resolve the continent once per distinct country (convert_continent returns
# None for names it cannot resolve).
unique_countries['Continent']=unique_countries['Country'].apply(convert_continent)
C:\Users\Utilisateur\anaconda3\lib\site-packages\pycountry\db.py:51: UserWarning: Country's official_name not found. Country name provided instead. warnings.warn(warning_message, UserWarning) C:\Users\Utilisateur\anaconda3\lib\site-packages\pycountry\db.py:51: UserWarning: Country's common_name not found. Country name provided instead. warnings.warn(warning_message, UserWarning)
# Geocode each distinct country. get_lat_lon returns a Series per country, so
# the result is assigned to the two coordinate columns at once.
# NOTE: this issues geocoder requests for every row of unique_countries.
unique_countries[['latitude', 'longitude']] = unique_countries['Country'].apply(get_lat_lon)
# Notebook display of the per-country lookup table.
unique_countries
| Country | Continent | latitude | longitude | |
|---|---|---|---|---|
| 0 | Afghanistan | Asia | 33.768006 | 66.238514 |
| 1 | Albania | Europe | 1.231526 | -75.892043 |
| 2 | Algeria | Africa | 28.000027 | 2.999983 |
| 3 | Angola | Africa | -11.877577 | 17.569124 |
| 4 | Antigua and Barbuda | North America | 17.223472 | -61.955461 |
| ... | ... | ... | ... | ... |
| 188 | Venezuela | South America | 8.001871 | -66.110932 |
| 189 | Viet Nam | Asia | 15.926666 | 107.965086 |
| 190 | Yemen | Asia | 16.347124 | 47.891527 |
| 191 | Zambia | Africa | -14.518912 | 27.558988 |
| 192 | Zimbabwe | Africa | -18.455496 | 29.746841 |
193 rows × 4 columns
# Attach Continent / latitude / longitude to every country-year row; the left
# join keeps all original rows even when a country has no lookup result.
df = df.merge(unique_countries, how='left', on='Country')
df
| Country | Year | Status | Life expectancy | Adult Mortality | infant deaths | Alcohol | percentage expenditure | Hepatitis B | Measles | ... | HIV/AIDS | GDP | Population | thinness 1-19 years | thinness 5-9 years | Income composition of resources | Schooling | Continent | latitude | longitude | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | Afghanistan | 2015 | Developing | 65.0 | 263.0 | 62 | 0.01 | 71.279624 | 65.0 | 1154 | ... | 0.1 | 584.259210 | 33736494.0 | 17.2 | 17.3 | 0.479 | 10.1 | Asia | 33.768006 | 66.238514 |
| 1 | Afghanistan | 2014 | Developing | 59.9 | 271.0 | 64 | 0.01 | 73.523582 | 62.0 | 492 | ... | 0.1 | 612.696514 | 327582.0 | 17.5 | 17.5 | 0.476 | 10.0 | Asia | 33.768006 | 66.238514 |
| 2 | Afghanistan | 2013 | Developing | 59.9 | 268.0 | 66 | 0.01 | 73.219243 | 64.0 | 430 | ... | 0.1 | 631.744976 | 31731688.0 | 17.7 | 17.7 | 0.470 | 9.9 | Asia | 33.768006 | 66.238514 |
| 3 | Afghanistan | 2012 | Developing | 59.5 | 272.0 | 69 | 0.01 | 78.184215 | 67.0 | 2787 | ... | 0.1 | 669.959000 | 3696958.0 | 17.9 | 18.0 | 0.463 | 9.8 | Asia | 33.768006 | 66.238514 |
| 4 | Afghanistan | 2011 | Developing | 59.2 | 275.0 | 71 | 0.01 | 7.097109 | 68.0 | 3013 | ... | 0.1 | 63.537231 | 2978599.0 | 18.2 | 18.2 | 0.454 | 9.5 | Asia | 33.768006 | 66.238514 |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 2933 | Zimbabwe | 2004 | Developing | 44.3 | 723.0 | 27 | 4.36 | 0.000000 | 68.0 | 31 | ... | 33.6 | 454.366654 | 12777511.0 | 9.4 | 9.4 | 0.407 | 9.2 | Africa | -18.455496 | 29.746841 |
| 2934 | Zimbabwe | 2003 | Developing | 44.5 | 715.0 | 26 | 4.06 | 0.000000 | 7.0 | 998 | ... | 36.7 | 453.351155 | 12633897.0 | 9.8 | 9.9 | 0.418 | 9.5 | Africa | -18.455496 | 29.746841 |
| 2935 | Zimbabwe | 2002 | Developing | 44.8 | 73.0 | 25 | 4.43 | 0.000000 | 73.0 | 304 | ... | 39.8 | 57.348340 | 125525.0 | 1.2 | 1.3 | 0.427 | 10.0 | Africa | -18.455496 | 29.746841 |
| 2936 | Zimbabwe | 2001 | Developing | 45.3 | 686.0 | 25 | 1.72 | 0.000000 | 76.0 | 529 | ... | 42.1 | 548.587312 | 12366165.0 | 1.6 | 1.7 | 0.427 | 9.8 | Africa | -18.455496 | 29.746841 |
| 2937 | Zimbabwe | 2000 | Developing | 46.0 | 665.0 | 24 | 1.68 | 0.000000 | 79.0 | 1483 | ... | 43.5 | 547.358878 | 12222251.0 | 11.0 | 11.2 | 0.434 | 9.8 | Africa | -18.455496 | 29.746841 |
2938 rows × 25 columns
# Cross-country mean of each metric for every year.
average_life_expectancy_yearly = df.groupby('Year', as_index=False)['Life expectancy '].mean()
average_population_yearly = df.groupby('Year', as_index=False)['Population'].mean()

# Two stacked panels: life expectancy on top, population below.
fig1 = sp.make_subplots(
    rows=2,
    cols=1,
    subplot_titles=['Average Life Expectancy', 'Average Population'],
)
fig1.add_trace(
    go.Scatter(
        x=average_life_expectancy_yearly['Year'],
        y=average_life_expectancy_yearly['Life expectancy '],
        mode='lines',
        name='Life Expectancy',
    ),
    row=1,
    col=1,
)
fig1.add_trace(
    go.Scatter(
        x=average_population_yearly['Year'],
        y=average_population_yearly['Population'],
        mode='lines',
        name='Population',
    ),
    row=2,
    col=1,
)

# Larger fonts and a fixed canvas size for readability.
fig1.update_layout(
    title_text="Average Life Expectancy and Population Over Time",
    title_font={'size': 24},
    legend={'font': {'size': 16}},
    font={'size': 15},
    width=1500,
    height=700,
)
fig1.show()
# Mean life expectancy per country over all available years.
average_life_expectancy = df.groupby('Country')['Life expectancy '].mean().reset_index()

# The five countries with the highest and the five with the lowest mean.
top5_countries = average_life_expectancy.nlargest(5, 'Life expectancy ')
bottom5_countries = average_life_expectancy.nsmallest(5, 'Life expectancy ')

# Keep only the rows of the original frame that belong to those ten countries.
selected_countries = [*top5_countries['Country'], *bottom5_countries['Country']]
filtered_df = df.loc[df['Country'].isin(selected_countries)]
selected_countries
['Japan', 'Sweden', 'Iceland', 'Switzerland', 'France', 'Sierra Leone', 'Central African Republic', 'Lesotho', 'Angola', 'Malawi']
# One line per selected country, life expectancy against year.
fig2 = px.line(
    filtered_df,
    x='Year',
    y='Life expectancy ',
    color='Country',
    title='Life Expectancy Over the Years for the top 5 and bottom 5 Selected Countries',
)

# Axis/legend captions, larger fonts and tight margins for readability.
fig2.update_layout(
    xaxis_title='Year',
    yaxis_title='Life Expectancy',
    title_font={'size': 24},
    legend_title='Country',
    font={'size': 15},
    legend={'font': {'size': 16}},
    width=1500,
    height=1000,
    margin={'l': 20, 'r': 20, 't': 60, 'b': 20},
)
fig2.show()
# Rows that have both a resolved continent and a life-expectancy value;
# reused by later per-continent plots.
df_na = df.dropna(subset=['Continent', 'Life expectancy '])

# Distribution of life expectancy per continent (violin with embedded box).
# The title names only life expectancy, which is the only variable plotted.
fig3 = px.violin(
    df_na,
    x='Continent',
    y='Life expectancy ',
    color='Continent',
    box=True,
    title='Violin Plot for Life Expectancy by Continent',
)

# Update layout for better readability
fig3.update_layout(
    xaxis_title='Continent',
    yaxis_title='Life Expectancy',
    legend_title='Continent',
    title_font=dict(size=24),
    font=dict(size=15),
    legend=dict(font=dict(size=16)),
    width=1500,
    height=1000,
)
fig3.show()
# `df` holds one row per country *and year*, so counting its rows would weight
# each country by how many years it has data for. Keep a single row per
# country (the first one encountered) so the pie really shows the share of
# countries in each development status, as the title states.
fig4 = px.pie(
    df.drop_duplicates(subset='Country'),
    names='Status',
    title='Distribution of Countries by Development Status',
    color='Status',  # Assigning colors based on the 'Status' column
    color_discrete_sequence=['#67001F', '#F4A582'],
    hole=0.4,
)

# Update layout for better readability
fig4.update_layout(
    width=1500,
    height=1000,
    legend_title_text='Development Status',
    legend=dict(font=dict(size=16)),
    title_font=dict(size=24),
    font=dict(size=15),
)
fig4.show()
# Mean GDP for every (year, development status) pair.
df_gdp_avg = df.groupby(['Year', 'Status'])['GDP'].mean().reset_index()

# One bar per status; the year is the animation frame, not an axis.
fig5 = px.bar(
    df_gdp_avg,
    x='Status',
    y='GDP',
    color='Status',
    title='Average GDP by Country Status Over Years',
    animation_frame='Year',
    barmode='group',
)

# Update layout for better readability
fig5.update_layout(
    # The x axis carries the development status (the year is selected with
    # the animation slider), so label it accordingly.
    xaxis_title='Development Status',
    yaxis_title='Average GDP (USD)',
    font=dict(size=15),
    title_font=dict(size=24),
    legend=dict(font=dict(size=16)),
    width=1200,
    height=800,
    # Fixed range so bars are comparable from frame to frame.
    yaxis=dict(range=[0, 35000]),
)
fig5.show()
# Box plot of life expectancy per development status, animated by year.
fig6 = px.box(
    df,
    x='Status',
    y='Life expectancy ',
    title='Life Expectancy Distribution by Status (2000 to 2015)',
    animation_frame='Year',
    # Play the frames in chronological order regardless of row order in df.
    category_orders={'Year': sorted(df['Year'].unique())},
    # The label key must match the column name exactly, including its
    # trailing space, otherwise the friendly label is never applied.
    labels={'Life expectancy ': 'Life Expectancy', 'Status': 'Development Status'},
)

# Update layout for better readability
fig6.update_layout(
    xaxis_title='Development Status',
    yaxis_title='Life Expectancy',
    font=dict(size=15),
    title_font=dict(size=24),
    legend=dict(font=dict(size=16)),
    width=1300,
    height=1000,
    # Fixed range so the boxes are comparable from frame to frame.
    yaxis=dict(range=[35, 90]),
)
fig6.show()
# Keep every numeric column for the correlation. Selecting only 'float64'
# would silently drop the integer-typed columns, so select all numeric dtypes.
numeric_columns = df.select_dtypes(include='number').columns
correlation_matrix = df[numeric_columns].corr()

# Heatmap of the pairwise correlations.
fig7 = px.imshow(
    correlation_matrix,
    color_continuous_scale='GnBu',
    labels=dict(x='Features', y='Features', color='Correlation'),
    title='Correlation Heatmap',
    width=1500,
    height=1000,
)

# Update layout for better readability
fig7.update_layout(title_font=dict(size=24))
fig7.show()
# Schooling against life expectancy, with an OLS trend line overlaid.
fig8 = px.scatter(
    df,
    x='Schooling',
    y='Life expectancy ',
    trendline="ols",
    title='Scatter Plot of Schooling vs Life Expectancy',
    labels={'Schooling': 'Years of Schooling', 'Life expectancy ': 'Life Expectancy'},
)

# Captions, canvas size and fonts.
fig8.update_layout(
    xaxis_title='Years of Schooling',
    yaxis_title='Life Expectancy',
    width=1500,
    height=1000,
    font={'size': 15},
    title_font={'size': 24},
    legend={'font': {'size': 16}},
)

# Restyle only the traces drawn in 'lines' mode (the selector leaves the
# marker trace untouched).
fig8.update_traces(
    line={'color': 'red', 'dash': 'solid'},
    selector={'mode': 'lines'},
)
fig8.show()
# Single-year snapshot (2014) of the rows that have a continent and a
# life-expectancy value.
df_2014 = df_na[df_na['Year'] == 2014]

# Income composition against life expectancy, one point per country.
fig9 = px.scatter(
    df_2014,
    x='Income composition of resources',
    y='Life expectancy ',
    color='Continent',
    hover_name='Country',
    title='Income Composition vs Life Expectancy',
    # The label key must match the column name exactly, including its
    # trailing space, otherwise the friendly label is never applied.
    labels={
        'Income composition of resources': 'Income Composition of Resources',
        'Life expectancy ': 'Life Expectancy',
    },
)

# Update layout for better readability
fig9.update_layout(
    xaxis_title='Income Composition of Resources',
    yaxis_title='Life Expectancy',
    width=1500,
    height=1000,
    font=dict(size=15),
    title_font=dict(size=24),
)
fig9.update_traces(marker=dict(size=8))
fig9.show()
# Mean BMI for every (continent, year) pair.
df_avg_bmi = df.groupby(['Continent', 'Year'])[' BMI '].mean().reset_index()

# Take the year span for the title from the data itself so the caption cannot
# drift out of sync with the frames that are actually animated.
bmi_first_year = df_avg_bmi['Year'].min()
bmi_last_year = df_avg_bmi['Year'].max()

# One bar per continent, animated by year.
fig10 = px.bar(
    df_avg_bmi,
    x='Continent',
    y=' BMI ',
    color='Continent',
    animation_frame='Year',
    title=f'Average BMI Over the Years by Continent ({bmi_first_year}-{bmi_last_year})',
    labels={' BMI ': 'Average BMI', 'Continent': 'Continent'},
    # Fixed y range spanning all frames so bars are comparable across years.
    range_y=[df_avg_bmi[' BMI '].min(), df_avg_bmi[' BMI '].max()],
)

# Update layout for better readability
fig10.update_layout(
    xaxis_title='Continent',
    yaxis_title='Average BMI',
    width=1500,
    height=1000,
    font=dict(size=15),
    title_font=dict(size=24),
    legend=dict(font=dict(size=16)),
)
fig10.show()
# World map coloured by the ' thinness 1-19 years' column, animated by year.
fig11 = px.choropleth(
    df,
    locations='Country',
    locationmode='country names',
    color=' thinness 1-19 years',
    hover_name='Country',
    color_continuous_scale=px.colors.sequential.Plasma,
    # Spelling of "across" corrected in the user-facing title.
    title='Thinness between 1-19 years old across countries',
    template='plotly',
    animation_frame='Year',
    # Play the frames in chronological order regardless of row order in df.
    category_orders={'Year': sorted(df['Year'].unique())},
)
fig11.update_geos(
    resolution=110,
    showcoastlines=True,
    coastlinecolor="Black",
    showland=True,
    landcolor="white",
)

# Update layout for better readability
fig11.update_layout(
    width=1500,
    height=1000,
    font=dict(size=15),
    title_font=dict(size=24),
)
fig11.show()
# Distribution of the 'Alcohol' column per continent (violin with embedded
# box), using the rows that have a resolved continent.
fig12 = px.violin(
    df_na,
    x='Continent',
    y='Alcohol',
    color='Continent',
    box=True,
    title='Violin plot on Alcohol Consumption by continent',
)

# Update layout for better readability
fig12.update_layout(
    xaxis_title='Continent',
    # Spelling of "consumption" corrected in the user-facing axis title.
    yaxis_title='Alcohol consumption (in Liters)',
    legend_title='Continent',
    legend=dict(font=dict(size=16)),
    font=dict(size=15),
    title_font=dict(size=24),
    width=1500,
    height=1000,
)
fig12.show()
# Yearly totals of the ' HIV/AIDS' and 'Measles ' columns across all countries.
df_hiv_measles = df.groupby('Year')[[' HIV/AIDS', 'Measles ']].sum().reset_index()

# Subplot titles are assigned top to bottom: row 1 is 'HIV/AIDS', row 2 is
# 'Measles'. Each trace must therefore go to the row whose title names it.
fig13 = sp.make_subplots(rows=2, cols=1, subplot_titles=['HIV/AIDS', 'Measles'])
bar_trace_hiv = go.Bar(x=df_hiv_measles['Year'], y=df_hiv_measles[' HIV/AIDS'], name='HIV/AIDS')
bar_trace_measles = go.Bar(x=df_hiv_measles['Year'], y=df_hiv_measles['Measles '], name='Measles')
fig13.add_trace(bar_trace_hiv, row=1, col=1)
fig13.add_trace(bar_trace_measles, row=2, col=1)

# Update layout for better readability
fig13.update_layout(
    title='Number of Deaths by HIV and Measles (2000-2015)',
    xaxis_title='Year',
    yaxis_title='Number of Deaths',
    width=1500,
    height=1000,
    font=dict(size=15),
    title_font=dict(size=24),
    legend=dict(font=dict(size=16)),
)
# Show a tick for every year, and caption the axes of both panels.
fig13.update_xaxes(tickmode='linear')
fig13.update_xaxes(title_text='Year', row=2, col=1)
fig13.update_yaxes(title_text='Number of Deaths')
fig13.show()
# World map coloured by the 'Adult Mortality' column, with one animation
# frame per year played in ascending order.
fig14 = px.choropleth(
    df,
    locations='Country',
    locationmode='country names',
    color='Adult Mortality',
    hover_name='Country',
    color_continuous_scale=px.colors.sequential.Plasma,
    title='Adult Mortality Across Countries',
    template='plotly',
    animation_frame='Year',
    category_orders={'Year': sorted(df['Year'].unique())},
)

# Map styling: black coastlines on white land.
fig14.update_geos(
    resolution=110,
    showcoastlines=True,
    coastlinecolor="Black",
    showland=True,
    landcolor="white",
)

# Canvas size and fonts.
fig14.update_layout(
    width=1500,
    height=1000,
    font={'size': 15},
    title_font={'size': 24},
)
fig14.show()
# World map coloured by the 'infant deaths' column, animated by year.
fig15 = px.choropleth(
    df,
    locations='Country',
    locationmode='country names',
    color='infant deaths',
    hover_name='Country',
    color_continuous_scale=px.colors.sequential.Plasma,
    # The title names the column that is actually plotted ('infant deaths').
    title='Infant Deaths Across Countries',
    template='plotly',
    animation_frame='Year',
    # Play the frames in chronological order regardless of row order in df.
    category_orders={'Year': sorted(df['Year'].unique())},
)
fig15.update_geos(
    resolution=110,
    showcoastlines=True,
    coastlinecolor="Black",
    showland=True,
    landcolor="white",
)

# Update layout for better readability
fig15.update_layout(
    width=1500,
    height=1000,
    font=dict(size=15),
    title_font=dict(size=24),
)
fig15.show()